Student Performance

Dataset link

link = https://archive.ics.uci.edu/dataset/320/student+performance

About The Data

The datasets student-mat.csv and student-por.csv contain information about students enrolled in Math and Portuguese language courses. The datasets include various attributes related to student demographics, academic background, and social factors.

Key attributes include:

School: The student’s school (“GP” for Gabriel Pereira, “MS” for Mousinho da Silveira).

Sex: Gender (“F” for female, “M” for male).

Age: Age (numeric, 15–22).

Family Background: Attributes like family size, parents’ education, and cohabitation status.

Parental Jobs: Occupation of the mother and father.

Study and Activity Attributes: Study time, extracurricular activities, school attendance, and romantic relationships.

Health & Social Behaviors: Alcohol consumption, health status, family relationships, and free time.

Grades: G1, G2, and G3 represent the first period, second period, and final grades (numeric, from 0 to 20).

Installing needed packages

# install.packages("ucimlrepo")
# install.packages("dplyr")
# install.packages("mlbench")
# install.packages("fastDummies")
# install.packages("caret")

Fetching the dataset from the UCI Machine Learning Repository

library(ucimlrepo)
library(dplyr)
library(mlbench)

# Download the Student Performance dataset (UCI repository id 320)
student_performance <- fetch_ucirepo(id = 320)

# Separate the predictors (X) from the target columns (Y)
X <- student_performance$data$features
Y <- student_performance$data$targets

# Re-attach the targets so everything lives in one data frame
data <- cbind(X, Y)
head(data)
##   school sex age address famsize Pstatus Medu Fedu     Mjob     Fjob     reason
## 1     GP   F  18       U     GT3       A    4    4  at_home  teacher     course
## 2     GP   F  17       U     GT3       T    1    1  at_home    other     course
## 3     GP   F  15       U     LE3       T    1    1  at_home    other      other
## 4     GP   F  15       U     GT3       T    4    2   health services       home
## 5     GP   F  16       U     GT3       T    3    3    other    other       home
## 6     GP   M  16       U     LE3       T    4    3 services    other reputation
##   guardian traveltime studytime failures schoolsup famsup paid activities
## 1   mother          2         2        0       yes     no   no         no
## 2   father          1         2        0        no    yes   no         no
## 3   mother          1         2        0       yes     no   no         no
## 4   mother          1         3        0        no    yes   no        yes
## 5   father          1         2        0        no    yes   no         no
## 6   mother          1         2        0        no    yes   no        yes
##   nursery higher internet romantic famrel freetime goout Dalc Walc health
## 1     yes    yes       no       no      4        3     4    1    1      3
## 2      no    yes      yes       no      5        3     3    1    1      3
## 3     yes    yes      yes       no      4        3     2    2    3      3
## 4     yes    yes      yes      yes      3        2     2    1    1      5
## 5     yes    yes       no       no      4        3     2    1    2      5
## 6     yes    yes      yes       no      5        4     2    1    2      5
##   absences G1 G2 G3
## 1        4  0 11 11
## 2        2  9 11 11
## 3        6 12 13 12
## 4        0 14 14 14
## 5        0 11 13 13
## 6        6 12 12 13
# Inspect the structure of the combined data frame: 649 rows, 33 columns,
# a mix of character (categorical) and integer columns
str(data)
## 'data.frame':    649 obs. of  33 variables:
##  $ school    : chr  "GP" "GP" "GP" "GP" ...
##  $ sex       : chr  "F" "F" "F" "F" ...
##  $ age       : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ address   : chr  "U" "U" "U" "U" ...
##  $ famsize   : chr  "GT3" "GT3" "LE3" "GT3" ...
##  $ Pstatus   : chr  "A" "T" "T" "T" ...
##  $ Medu      : int  4 1 1 4 3 4 2 4 3 3 ...
##  $ Fedu      : int  4 1 1 2 3 3 2 4 2 4 ...
##  $ Mjob      : chr  "at_home" "at_home" "at_home" "health" ...
##  $ Fjob      : chr  "teacher" "other" "other" "services" ...
##  $ reason    : chr  "course" "course" "other" "home" ...
##  $ guardian  : chr  "mother" "father" "mother" "mother" ...
##  $ traveltime: int  2 1 1 1 1 1 1 2 1 1 ...
##  $ studytime : int  2 2 2 3 2 2 2 2 2 2 ...
##  $ failures  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ schoolsup : chr  "yes" "no" "yes" "no" ...
##  $ famsup    : chr  "no" "yes" "no" "yes" ...
##  $ paid      : chr  "no" "no" "no" "no" ...
##  $ activities: chr  "no" "no" "no" "yes" ...
##  $ nursery   : chr  "yes" "no" "yes" "yes" ...
##  $ higher    : chr  "yes" "yes" "yes" "yes" ...
##  $ internet  : chr  "no" "yes" "yes" "yes" ...
##  $ romantic  : chr  "no" "no" "no" "yes" ...
##  $ famrel    : int  4 5 4 3 4 5 4 4 4 5 ...
##  $ freetime  : int  3 3 3 2 3 4 4 1 2 5 ...
##  $ goout     : int  4 3 2 2 2 2 4 4 2 1 ...
##  $ Dalc      : int  1 1 2 1 1 1 1 1 1 1 ...
##  $ Walc      : int  1 1 3 1 2 2 1 1 1 1 ...
##  $ health    : int  3 3 3 5 5 5 3 1 1 5 ...
##  $ absences  : int  4 2 6 0 0 6 0 2 0 0 ...
##  $ G1        : int  0 9 12 14 11 12 13 10 15 12 ...
##  $ G2        : int  11 11 13 14 13 12 12 13 16 12 ...
##  $ G3        : int  11 11 12 14 13 13 13 13 17 13 ...
# Check whether there are any missing values (0 means the data is complete)
sum(is.na(data))
## [1] 0

Converting the categorical columns into factors

library(dplyr)

# Convert every categorical column to a factor so modeling functions treat
# them as discrete levels.
# FIX: `address` (urban "U" / rural "R") was previously left as character —
# it is categorical like the rest, so it is included here. (Model results are
# unaffected: lm()/caret coerce character to factor internally, but the data
# frame is now consistent.)
data <- data %>%
  mutate(across(c(school, sex, address, famsize, Pstatus, Mjob, Fjob, reason,
                  guardian, schoolsup, famsup, paid, activities, nursery,
                  higher, internet, romantic),
                as.factor))

# Re-inspect the structure to confirm the factor conversion took effect
str(data)
## 'data.frame':    649 obs. of  33 variables:
##  $ school    : Factor w/ 2 levels "GP","MS": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex       : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
##  $ age       : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ address   : chr  "U" "U" "U" "U" ...
##  $ famsize   : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
##  $ Pstatus   : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
##  $ Medu      : int  4 1 1 4 3 4 2 4 3 3 ...
##  $ Fedu      : int  4 1 1 2 3 3 2 4 2 4 ...
##  $ Mjob      : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
##  $ Fjob      : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
##  $ reason    : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
##  $ guardian  : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
##  $ traveltime: int  2 1 1 1 1 1 1 2 1 1 ...
##  $ studytime : int  2 2 2 3 2 2 2 2 2 2 ...
##  $ failures  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
##  $ famsup    : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
##  $ paid      : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
##  $ nursery   : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
##  $ higher    : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ internet  : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
##  $ romantic  : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ famrel    : int  4 5 4 3 4 5 4 4 4 5 ...
##  $ freetime  : int  3 3 3 2 3 4 4 1 2 5 ...
##  $ goout     : int  4 3 2 2 2 2 4 4 2 1 ...
##  $ Dalc      : int  1 1 2 1 1 1 1 1 1 1 ...
##  $ Walc      : int  1 1 3 1 2 2 1 1 1 1 ...
##  $ health    : int  3 3 3 5 5 5 3 1 1 5 ...
##  $ absences  : int  4 2 6 0 0 6 0 2 0 0 ...
##  $ G1        : int  0 9 12 14 11 12 13 10 15 12 ...
##  $ G2        : int  11 11 13 14 13 12 12 13 16 12 ...
##  $ G3        : int  11 11 12 14 13 13 13 13 17 13 ...

Statistical Analysis

Correlation Analysis

library(dplyr)

# Numeric columns whose linear association with the final grade we want
num_cols <- c("age", "Medu", "Fedu", "traveltime", "failures", "studytime",
              "famrel", "freetime", "goout", "Dalc", "Walc", "health",
              "absences", "G1", "G2", "G3")

# Pairwise Pearson correlation matrix over the numeric columns
corr_matrix <- cor(data[, num_cols])

# Keep only each variable's correlation with the final grade G3
corr_with_G3 <- corr_matrix[, "G3"]
print(corr_with_G3)
##         age        Medu        Fedu  traveltime    failures   studytime 
## -0.10650539  0.24015076  0.21179968 -0.12717297 -0.39331555  0.24978869 
##      famrel    freetime       goout        Dalc        Walc      health 
##  0.06336113 -0.12270493 -0.08764072 -0.20471940 -0.17661887 -0.09885124 
##    absences          G1          G2          G3 
## -0.09137906  0.82638712  0.91854800  1.00000000
  • Strong Positive Correlations : G1 , G2 — students who perform well early in the course are highly likely to maintain that performance
  • Moderate Positive Influences : Study time , Mother’s Education , Father’s Education — students who study more and have more educated parents tend to perform better.
  • Moderate to Strong Negative Correlation : Past Failures meaning students who have failed before are significantly more likely to have lower final grades.
  • Weak Negative Influences : Age , Travel Time , Dalc , Walc , Free time , Health , Absences , Going out — these show only a slight negative relationship with G3
## corrplot 0.95 loaded

P-value Interpretation Statistical Significance

ANOVA Analysis

# One-way ANOVA of G3 against each categorical predictor.
# A small Pr(>F) means the mean final grade differs significantly
# between the groups of that variable.
# Analysis of G3 and School
summary(aov(G3 ~ school, data = data))
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## school        1    547   546.6   56.89 1.57e-13 ***
## Residuals   647   6217     9.6                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and famsize
summary(aov(G3 ~ famsize , data = data))
##              Df Sum Sq Mean Sq F value Pr(>F)
## famsize       1     14   13.71   1.314  0.252
## Residuals   647   6750   10.43
# Analysis of G3 and Pstatus (comment fixed: previously mislabeled "famsize")
summary(aov(G3 ~ Pstatus , data = data))
##              Df Sum Sq Mean Sq F value Pr(>F)
## Pstatus       1      0   0.004       0  0.985
## Residuals   647   6763  10.453
# Analysis of G3 and sex
summary(aov(G3 ~ sex, data = data))
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## sex           1    113  112.68   10.96 0.000982 ***
## Residuals   647   6651   10.28                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and Mjob
summary(aov(G3 ~ Mjob , data = data))
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## Mjob          4    296   74.01    7.37 8.31e-06 ***
## Residuals   644   6467   10.04                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and Fjob
summary(aov(G3 ~ Fjob , data = data))
##              Df Sum Sq Mean Sq F value Pr(>F)  
## Fjob          4    135   33.68   3.273 0.0114 *
## Residuals   644   6629   10.29                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and reason
summary(aov(G3 ~ reason , data = data))
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## reason        3    308  102.57   10.25 1.34e-06 ***
## Residuals   645   6456   10.01                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and schoolsup
summary(aov(G3 ~ schoolsup , data = data))
##              Df Sum Sq Mean Sq F value Pr(>F)  
## schoolsup     1     30   29.82   2.866  0.091 .
## Residuals   647   6733   10.41                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and famsup (comment fixed: was "fampsu")
summary(aov(G3 ~ famsup , data = data))
##              Df Sum Sq Mean Sq F value Pr(>F)
## famsup        1     24   23.71   2.276  0.132
## Residuals   647   6740   10.42
# Analysis of G3 and paid
summary(aov(G3 ~ paid, data = data))
##              Df Sum Sq Mean Sq F value Pr(>F)
## paid          1     20   20.38   1.956  0.162
## Residuals   647   6743   10.42
# Analysis of G3 and activities
summary(aov(G3 ~ activities , data = data))
##              Df Sum Sq Mean Sq F value Pr(>F)
## activities    1     24   24.18   2.321  0.128
## Residuals   647   6739   10.42
# Analysis of G3 and nursery
summary(aov(G3 ~ nursery , data = data))
##              Df Sum Sq Mean Sq F value Pr(>F)
## nursery       1      6   5.591   0.535  0.465
## Residuals   647   6758  10.445
# Analysis of G3 and higher
summary(aov(G3 ~ higher , data = data))
##              Df Sum Sq Mean Sq F value Pr(>F)    
## higher        1    746   746.2   80.24 <2e-16 ***
## Residuals   647   6017     9.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and internet
summary(aov(G3 ~ internet , data = data))
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## internet      1    152  152.22    14.9 0.000125 ***
## Residuals   647   6611   10.22                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and romantic
summary(aov(G3 ~ romantic , data = data))
##              Df Sum Sq Mean Sq F value Pr(>F)  
## romantic      1     55   55.49   5.353  0.021 *
## Residuals   647   6708   10.37                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Visualizing the Significant columns

library(ggplot2)

# Boxplots of the final grade G3 across each predictor the ANOVA flagged
# as significant (x and y mapped positionally; fill colors the groups).
ggplot(data, aes(school, G3, fill = school)) +
  geom_boxplot()

ggplot(data, aes(sex, G3, fill = sex)) +
  geom_boxplot()

ggplot(data, aes(Mjob, G3, fill = Mjob)) +
  geom_boxplot()

ggplot(data, aes(Fjob, G3, fill = Fjob)) +
  geom_boxplot()

ggplot(data, aes(reason, G3, fill = reason)) +
  geom_boxplot()

ggplot(data, aes(higher, G3, fill = higher)) +
  geom_boxplot()

ggplot(data, aes(internet, G3, fill = internet)) +
  geom_boxplot()

ggplot(data, aes(romantic, G3, fill = romantic)) +
  geom_boxplot()

One-hot Encoding

library(fastDummies)
library(dplyr)

# One-hot encode the significant yes/no predictors; drop the source columns
# and the first dummy level of each to avoid perfect collinearity.
encoded_data <- data %>%
  dummy_cols(
    select_columns = c("internet", "higher", "romantic"),
    remove_selected_columns = TRUE,
    remove_first_dummy = TRUE
  )
colnames(encoded_data)
##  [1] "school"       "sex"          "age"          "address"      "famsize"     
##  [6] "Pstatus"      "Medu"         "Fedu"         "Mjob"         "Fjob"        
## [11] "reason"       "guardian"     "traveltime"   "studytime"    "failures"    
## [16] "schoolsup"    "famsup"       "paid"         "activities"   "nursery"     
## [21] "famrel"       "freetime"     "goout"        "Dalc"         "Walc"        
## [26] "health"       "absences"     "G1"           "G2"           "G3"          
## [31] "internet_yes" "higher_yes"   "romantic_yes"
# Drop the categorical columns that were not significant in the ANOVA
encoded_data <- encoded_data %>%
  select(-famsize, -Pstatus, -schoolsup, -famsup, -paid, -activities, -nursery)

Splitting Data into training 80% and testing 20%

library(caret)
## Loading required package: lattice
set.seed(123)

# Reproducible 80/20 split, stratified on the outcome G3
# (index renamed from `split`, which shadowed base::split)
train_idx <- createDataPartition(encoded_data$G3, p = 0.8, list = FALSE)

train_data <- encoded_data[train_idx, ]
test_data <- encoded_data[-train_idx, ]

Checking the Assumption of Linear Regression Model

# Fit an ordinary least squares model on the training data to check the
# residual-normality assumption behind linear-regression inference.
model <- lm(G3 ~ ., data = train_data)
residuals <- resid(model)  # NOTE(review): this name shadows base::residuals()

# Histogram of residuals (should look roughly bell-shaped and centered at 0)
hist(residuals)

# Q-Q plot for normality (points should track the reference line)
qqnorm(residuals)
qqline(residuals)

Training Linear Regression Model

# Train a linear regression model through caret's unified interface.
# No trControl is supplied, so caret's default bootstrap resampling is used.
LM_Model <- train(G3 ~ ., data = train_data, method = "lm")

summary(LM_Model)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8352 -0.5189 -0.0036  0.6036  5.0036 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.762731   1.041534   0.732  0.46433    
## schoolMS         -0.289032   0.143670  -2.012  0.04480 *  
## sexM             -0.095076   0.128155  -0.742  0.45852    
## age               0.018109   0.052968   0.342  0.73259    
## addressU          0.131752   0.142023   0.928  0.35403    
## Medu             -0.095752   0.081812  -1.170  0.24242    
## Fedu              0.059187   0.072831   0.813  0.41681    
## Mjobhealth        0.302100   0.290155   1.041  0.29832    
## Mjobother        -0.198507   0.161067  -1.232  0.21838    
## Mjobservices      0.122871   0.196158   0.626  0.53135    
## Mjobteacher       0.186508   0.260118   0.717  0.47371    
## Fjobhealth       -0.500919   0.417590  -1.200  0.23090    
## Fjobother        -0.450198   0.248414  -1.812  0.07056 .  
## Fjobservices     -0.572066   0.260882  -2.193  0.02879 *  
## Fjobteacher      -0.607158   0.363081  -1.672  0.09512 .  
## reasonhome       -0.157382   0.150565  -1.045  0.29642    
## reasonother      -0.509493   0.197128  -2.585  0.01004 *  
## reasonreputation -0.237993   0.158982  -1.497  0.13505    
## guardianmother   -0.106169   0.142556  -0.745  0.45678    
## guardianother     0.262006   0.280541   0.934  0.35080    
## traveltime        0.173910   0.084351   2.062  0.03977 *  
## studytime         0.018560   0.074135   0.250  0.80242    
## failures         -0.340216   0.116323  -2.925  0.00361 ** 
## famrel           -0.050596   0.064879  -0.780  0.43586    
## freetime         -0.060300   0.059297  -1.017  0.30971    
## goout             0.014457   0.057324   0.252  0.80100    
## Dalc             -0.008894   0.083555  -0.106  0.91527    
## Walc             -0.033729   0.062925  -0.536  0.59219    
## health           -0.031120   0.041640  -0.747  0.45520    
## absences          0.014339   0.013128   1.092  0.27528    
## G1                0.141916   0.044157   3.214  0.00140 ** 
## G2                0.870382   0.039524  22.021  < 2e-16 ***
## internet_yes      0.071404   0.144487   0.494  0.62140    
## higher_yes        0.196222   0.215570   0.910  0.36314    
## romantic_yes      0.044044   0.121725   0.362  0.71763    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.271 on 485 degrees of freedom
## Multiple R-squared:  0.8633, Adjusted R-squared:  0.8537 
## F-statistic:  90.1 on 34 and 485 DF,  p-value: < 2.2e-16
# Score the linear model on the hold-out test set
predictions <- predict(LM_Model, newdata = test_data)
actuals <- test_data$G3
postResample(pred = predictions, obs = actuals)
##      RMSE  Rsquared       MAE 
## 1.1827269 0.8290849 0.7861066
# Actual vs predicted final grades; points on the red line are perfect
plot(test_data$G3, predictions,
     xlab = "Actual G3",
     ylab = "Predicted G3",
     main = "Actual vs Predicted Linear Regression",
     pch = 19, col = "blue")
abline(0, 1, col = "red")  # 45-degree reference line

Using Cross-Validation with Elastic Net Regression

# 10-fold cross-validation setup.
# FIX: removed the trailing comma after `number = 10` — it passes an empty
# argument to trainControl(), which is fragile/erroneous in R.
train_control <- trainControl(
  method = "cv",
  number = 10
)

# Elastic net (glmnet) over caret's default alpha/lambda grid, selecting
# the combination with the lowest cross-validated RMSE.
GLM_Model <- train(
  G3 ~ .,
  data = train_data,
  method = "glmnet",
  trControl = train_control,
  metric = "RMSE"  # metric is a single string, not a vector
)
print(GLM_Model)
## glmnet 
## 
## 520 samples
##  25 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 469, 467, 467, 468, 470, 469, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda       RMSE      Rsquared   MAE      
##   0.10   0.006103841  1.263307  0.8661025  0.8416234
##   0.10   0.061038413  1.257684  0.8663480  0.8360885
##   0.10   0.610384126  1.299546  0.8602287  0.8459695
##   0.55   0.006103841  1.261507  0.8663817  0.8402252
##   0.55   0.061038413  1.238777  0.8691543  0.8224813
##   0.55   0.610384126  1.302952  0.8680741  0.8264794
##   1.00   0.006103841  1.259085  0.8666304  0.8381556
##   1.00   0.061038413  1.232570  0.8696969  0.8188343
##   1.00   0.610384126  1.353099  0.8682164  0.8804954
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.06103841.
# Access final results
best_model_results <- GLM_Model$results[
  GLM_Model$results$alpha == GLM_Model$bestTune$alpha &
  GLM_Model$results$lambda == GLM_Model$bestTune$lambda,
]

# Print RMSE, R-squared, and MAE
cat("Best RMSE:", best_model_results$RMSE, "\n")
## Best RMSE: 1.23257
cat("Best R-squared:", best_model_results$Rsquared, "\n")
## Best R-squared: 0.8696969
cat("Best MAE:", best_model_results$MAE, "\n")
## Best MAE: 0.8188343
predictions_GLM <- predict(GLM_Model , newdata = test_data)
actuals_GLM <- test_data$G3
postResample(pred = predictions_GLM, obs = actuals_GLM)
##      RMSE  Rsquared       MAE 
## 1.1346729 0.8397974 0.7292078
# Plotting Actual vs Predicted
plot(test_data$G3 , predictions_GLM,
     xlab = "Actual G3",
     ylab = "Predicted G3",
     main = "Actual vs Predicted GLMNET",
     pch = 19, col = "blue")
abline(0, 1, col = "red")  # reference line)

Using Support Vector Machine

library(e1071)
library(quantmod)  # NOTE(review): not used below — loaded in the original, kept as-is
library(kernlab)

# Radial-kernel SVM: caret tunes C over 5 values, kernlab estimates sigma.
# Predictors are centered and scaled first (SVMs are scale-sensitive).
svm_model <- train(
  G3 ~ .,
  data = train_data,
  method = "svmRadial",
  trControl = train_control,
  preProcess = c("center", "scale"),
  tuneLength = 5
)

# Linear-kernel SVM for comparison (svmLinear holds C constant at 1)
svm_linear <- train(
  G3 ~ .,
  data = train_data,
  method = "svmLinear",
  trControl = train_control,
  preProcess = c("center", "scale"),
  tuneLength = 5
)

# Cross-validated results for the radial-kernel SVM
print(svm_model)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 520 samples
##  25 predictor
## 
## Pre-processing: centered (34), scaled (34) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 468, 467, 468, 468, 468, 469, ... 
## Resampling results across tuning parameters:
## 
##   C     RMSE      Rsquared   MAE      
##   0.25  1.735138  0.7742168  1.0406233
##   0.50  1.626734  0.7887121  0.9768147
##   1.00  1.544052  0.8006845  0.9536633
##   2.00  1.500242  0.8040384  0.9683193
##   4.00  1.508932  0.7968974  1.0049898
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01765752
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01765752 and C = 2.
# Best model: the linear kernel has the lower cross-validated RMSE
print(svm_linear)
## Support Vector Machines with Linear Kernel 
## 
## 520 samples
##  25 predictor
## 
## Pre-processing: centered (34), scaled (34) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 467, 466, 469, 467, 469, 469, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE      
##   1.263597  0.8525412  0.7958404
## 
## Tuning parameter 'C' was held constant at a value of 1
# Predict on test data with the better (linear) SVM
pred_svm <- predict(svm_linear, newdata = test_data)
actual_svm <- test_data$G3
postResample(pred = pred_svm, obs = actual_svm)
##      RMSE  Rsquared       MAE 
## 1.1124462 0.8444150 0.7324652
# Plotting Actual vs Predicted values for the linear SVM.
# FIX: corrected the y-axis label typo ("Predicited" -> "Predicted").
plot(test_data$G3, pred_svm,
     xlab = "Actual Point",
     ylab = "Predicted Point",
     main = "Actual vs Predicted SVM",
     pch = 19, col = "blue")
abline(0, 1, col = "red")  # 45-degree reference line

Using K-Nearest Neighbour

# k-nearest-neighbours regression on centered/scaled predictors.
# tuneLength = 10 makes caret try ten odd k values (5, 7, ..., 23),
# not k = 1 to 10 as the original comment claimed.
knn_model <- train(
  G3 ~ .,
  data = train_data,
  method = "knn",
  trControl = train_control,
  preProcess = c("center", "scale"),
  tuneLength = 10
)

# Cross-validated results across the candidate k values
print(knn_model)
## k-Nearest Neighbors 
## 
## 520 samples
##  25 predictor
## 
## Pre-processing: centered (34), scaled (34) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 467, 466, 468, 468, 470, 470, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  2.301127  0.5577571  1.645487
##    7  2.297209  0.5767190  1.625549
##    9  2.300054  0.5894665  1.610932
##   11  2.294154  0.5963439  1.617475
##   13  2.309007  0.5971723  1.627464
##   15  2.316585  0.6107240  1.627697
##   17  2.323524  0.6164017  1.630080
##   19  2.308500  0.6339417  1.622317
##   21  2.317659  0.6393866  1.633124
##   23  2.307599  0.6533244  1.629211
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.
# RMSE vs k across the tuning grid
plot(knn_model)

# Score the tuned KNN model on the hold-out test set
pred_knn <- predict(knn_model, newdata = test_data)
actual_knn <- test_data$G3

postResample(pred = pred_knn, obs = actual_knn)
##      RMSE  Rsquared       MAE 
## 2.0234546 0.4931356 1.4763918
# Plotting Actual vs Predicted Point; points on the red line are perfect
plot(actual_knn , pred_knn,
     xlab = "Actual Point",
     ylab = "Predicted Point",
     main = "Actual vs Predicted KNN",
     pch = 19, col = "blue")
abline(0, 1, col = "red")

Using Decision Tree Model

library(rpart)
library(rpart.plot)
library(caret)

# CART regression tree: caret tunes the complexity parameter cp over 10
# values. (Trees are scale-invariant, so center/scale is harmless here;
# kept to match the other models' preprocessing.)
DT_model <- train(
  G3 ~ .,
  data = train_data,
  method = "rpart",
  trControl = train_control,
  preProcess = c("center", "scale"),
  tuneLength = 10
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Cross-validated results across the cp grid
print(DT_model)
## CART 
## 
## 520 samples
##  25 predictor
## 
## Pre-processing: centered (34), scaled (34) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 467, 468, 469, 468, 468, 469, ... 
## Resampling results across tuning parameters:
## 
##   cp           RMSE      Rsquared   MAE      
##   0.002726407  1.410894  0.8203079  0.9045836
##   0.003648967  1.440506  0.8129849  0.9198853
##   0.005194941  1.436188  0.8133545  0.9115749
##   0.013105992  1.464972  0.8057678  0.9369913
##   0.016933022  1.514198  0.7924949  1.0073416
##   0.019677337  1.531827  0.7885625  1.0279373
##   0.054730025  1.763985  0.7337188  1.1847924
##   0.087782927  1.908757  0.6795113  1.3507883
##   0.146849557  2.221369  0.5691557  1.5603428
##   0.512011506  2.722852  0.4918340  1.9699223
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.002726407.
# Visualize the final fitted tree
rpart.plot(DT_model$finalModel)

# Score the tuned tree on the hold-out test set
pred_DT <- predict(DT_model, newdata = test_data)
actual_DT <- test_data$G3

postResample(pred = pred_DT, obs = actual_DT)
##      RMSE  Rsquared       MAE 
## 1.1715342 0.8280618 0.6960317
# Plotting Actual vs Predicted; points on the red line are perfect
plot(actual_DT , pred_DT,
     xlab = "Actual Point",
     ylab = "Predicted Point",
     main = "Actual vs Predicted DT",
     pch = 19, col = "blue")
abline(0, 1, col = "red")

Using Random Forest Model

library(randomForest)

# Random forest via caret, tuning mtry (predictors tried per split)
# over 5 values on centered/scaled data
RF_model <- train(
  G3 ~ .,
  data = train_data,
  method = "rf",
  trControl = train_control,
  preProcess = c("center", "scale"),
  tuneLength = 5
)
# Cross-validated results across the mtry grid
print(RF_model)
## Random Forest 
## 
## 520 samples
##  25 predictor
## 
## Pre-processing: centered (34), scaled (34) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 469, 468, 469, 468, 468, 468, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE      
##    2    2.012521  0.7373867  1.3318100
##   10    1.338777  0.8553028  0.8641057
##   18    1.296631  0.8611280  0.8411014
##   26    1.293885  0.8596675  0.8463808
##   34    1.319895  0.8517951  0.8586288
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 26.
# RMSE vs mtry across the tuning grid
plot(RF_model)

# Score the tuned forest on the hold-out test set
pred_RF <- predict(RF_model , newdata = test_data)
actual_RF <- test_data$G3
postResample(pred = pred_RF , obs = actual_RF)
##      RMSE  Rsquared       MAE 
## 1.1549628 0.8341529 0.7232408
# Plotting Actual vs Predicted; points on the red line are perfect
plot(actual_RF , pred_RF,
     xlab = "Actual Point",
     ylab = "Predicted Point",
     main = "Actual vs Predicted RF",
     pch = 19, col = "blue")
abline(0, 1, col = "red")

Using XGBoost Model

# Gradient-boosted trees (xgboost) over caret's default tuning grid
# for xgbTree (eta, max_depth, gamma, colsample_bytree, min_child_weight,
# subsample, nrounds), evaluated with 10-fold cross-validation
XGB_model <- train(
  G3 ~ .,
  data = train_data,
  method = "xgbTree",
  trControl = train_control,
  preProcess = c("center", "scale"),
  tuneLength = 5
)
# Extract the best training performance: match the results row whose seven
# tuning parameters all equal the winning combination in bestTune
best_metrics <- XGB_model$results[
  XGB_model$results$nrounds == XGB_model$bestTune$nrounds &
  XGB_model$results$max_depth == XGB_model$bestTune$max_depth &
  XGB_model$results$eta == XGB_model$bestTune$eta &
  XGB_model$results$gamma == XGB_model$bestTune$gamma &
  XGB_model$results$colsample_bytree == XGB_model$bestTune$colsample_bytree &
  XGB_model$results$min_child_weight == XGB_model$bestTune$min_child_weight &
  XGB_model$results$subsample == XGB_model$bestTune$subsample,
]

print(best_metrics)
##    eta max_depth gamma colsample_bytree min_child_weight subsample nrounds
## 41 0.3         1     0              0.8                1     0.875      50
##        RMSE Rsquared       MAE    RMSESD RsquaredSD    MAESD
## 41 1.320218 0.846094 0.8551095 0.4410115 0.07151767 0.158168
# View the best tuning parameters
XGB_model$bestTune
##    nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 41      50         1 0.3     0              0.8                1     0.875
# RMSE across the tuning grid
plot(XGB_model)

# Score the tuned booster on the hold-out test set
pred_XGB <- predict(XGB_model , newdata = test_data)
actual_XGB <- test_data$G3
postResample(pred = pred_XGB , obs = actual_XGB)
##      RMSE  Rsquared       MAE 
## 1.1975426 0.8209977 0.7572757